The dataset can be found at https://www.kaggle.com/avikasliwal/used-cars-price-prediction#train-data.csv
It has following parameters:
Name: The brand and model of the car.
Location: The location in which the car is being sold or is available for purchase.
Year: The year or edition of the model.
Kilometers_Driven: The total kilometres driven in the car by the previous owner(s) in KM.
Fuel_Type: The type of fuel used by the car. (Petrol / Diesel / Electric / CNG / LPG)
Transmission: The type of transmission used by the car. (Automatic / Manual)
Owner_Type: Whether the ownership is Firsthand, Second hand or other.
Mileage: The standard mileage offered by the car company in kmpl or km/kg
Engine: The displacement volume of the engine in cc.
Power: The maximum power of the engine in bhp.
Seats: The number of seats in the car.
New_Price: The price of a new car of the same model.
Price: The price of the used car in INR Lakhs.
The price prediction for used cars is a very important part of the used car business. Predicting prices accurately can help businesses and customers settle a deal at a fair price. Vehicle price prediction, especially when the vehicle is used and not coming directly from the factory, is both a critical and important task. With an increase in demand for used cars and up to an 8 percent decrease in demand for new cars in 2013, more and more vehicle buyers are finding alternatives to buying new cars outright.
#reading data from the csv
library(readr)
# readr leaves Mileage/Engine/Power/New_Price as character ("18.2 kmpl", etc.);
# units are stripped and types fixed later in clean_data()
train_data = read_csv("./train-data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## Name = col_character(),
## Location = col_character(),
## Year = col_double(),
## Kilometers_Driven = col_double(),
## Fuel_Type = col_character(),
## Transmission = col_character(),
## Owner_Type = col_character(),
## Mileage = col_character(),
## Engine = col_character(),
## Power = col_character(),
## Seats = col_double(),
## New_Price = col_character(),
## Price = col_double()
## )
test_data = read_csv("./test-data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## Name = col_character(),
## Location = col_character(),
## Year = col_double(),
## Kilometers_Driven = col_double(),
## Fuel_Type = col_character(),
## Transmission = col_character(),
## Owner_Type = col_character(),
## Mileage = col_character(),
## Engine = col_character(),
## Power = col_character(),
## Seats = col_double(),
## New_Price = col_character()
## )
The first thing we did was to clean the dataset that we were given. In the dataset, the values for Power, Mileage and Engine are stored as strings with their units attached. So we had to remove those units and convert their types to numeric.
Variables like Location, Transmission, Fuel_Type and Owner_Type can be considered factor variables, therefore they were converted to factors.
Columns for X1 and Name were removed as they were not really affecting the price.
#clean data function
# Cleans a raw used-car data frame: drops New_Price, strips unit suffixes from
# the numeric-as-text columns, converts categoricals to factors, and removes
# unusable rows/columns.
clean_data = function(data) {
  #removing New Price column (column 13; mostly missing, so unusable as a predictor)
  data = data[, -13]
  # strip a trailing unit word (" kmpl", " km/kg", " CC", " bhp") and coerce to
  # numeric; sub() is vectorized, so no lapply/unlist loop is needed.
  # Unparseable values (e.g. "null bhp") become NA and are dropped below.
  remove_units = function(values) {
    as.numeric(sub("\\s+\\D+$", "", values))
  }
  data$Location = factor(data$Location)
  data$Transmission = factor(data$Transmission)
  data$Fuel_Type = factor(data$Fuel_Type)
  data$Owner_Type = factor(data$Owner_Type)
  data$Power = remove_units(data$Power)
  data$Mileage = remove_units(data$Mileage)
  data$Engine = remove_units(data$Engine)
  data = na.omit(data)
  # drop zero-mileage rows. BUG FIX: the original
  # `data[-which(data$Mileage == 0), ]` deletes EVERY row when no mileage is
  # zero, because negative indexing with integer(0) selects nothing.
  # Logical subsetting has no such edge case.
  data = data[data$Mileage != 0, ]
  #removing X1 and Name columns from the dataset
  data = data[, -c(1, 2)]
  return(data)
}
# apply the same cleaning to both datasets so their columns and factor levels line up
train_data = clean_data(train_data)
test_data = clean_data(test_data)
We split our train_data into trn_data and tst_data. The test_data provided in the file with the dataset is for the competition and does not include any Price parameter.
# 75/25 train/test split of the labelled data (test-data.csv has no Price column)
smp_size = floor(0.75 * nrow(train_data))
## set the seed to make your partition reproducible
set.seed(42)
train_ind = sample(seq_len(nrow(train_data)), size = smp_size)
trn_data = train_data[train_ind, ]
tst_data = train_data[-train_ind, ]
head(trn_data)
## # A tibble: 6 x 11
## Location Year Kilometers_Driv… Fuel_Type Transmission Owner_Type Mileage
## <fct> <dbl> <dbl> <fct> <fct> <fct> <dbl>
## 1 Coimbat… 2017 24806 Petrol Manual First 18.2
## 2 Coimbat… 2016 44340 Diesel Automatic First 17.0
## 3 Delhi 2012 68038 Diesel Manual First 20.1
## 4 Delhi 2016 22000 Diesel Manual First 24.3
## 5 Ahmedab… 2011 76000 CNG Manual First 26.3
## 6 Mumbai 2015 29631 Petrol Automatic First 18.9
## # … with 4 more variables: Engine <dbl>, Power <dbl>, Seats <dbl>,
## # Price <dbl>
cor(trn_data[c("Kilometers_Driven","Mileage","Engine","Power")])
## Kilometers_Driven Mileage Engine Power
## Kilometers_Driven 1.00000000 -0.05500041 0.08971581 0.04084805
## Mileage -0.05500041 1.00000000 -0.64849776 -0.56117361
## Engine 0.08971581 -0.64849776 1.00000000 0.87573788
## Power 0.04084805 -0.56117361 0.87573788 1.00000000
We can see from the correlation matrix that Power and Engine are highly correlated, and Engine and Mileage are also correlated to some extent. We will have to be careful before using all of them in our model.
# Exploratory plots of Price against each numeric predictor, on the raw and
# log scales, to judge which transformation suits a linear fit better.
# BUG FIX: several titles below were copy-pasted incorrectly (raw-scale plots
# titled "log(...)", and the Kilometers_Driven plots titled "Engine").
plot(Price ~ Power, data = trn_data,
     col = "dodgerblue",
     main = "Price and Power relation")
plot(log(Price) ~ Power, data = trn_data,
     col = "dodgerblue",
     main = "log(Price) and Power relation")
plot(log(Price) ~ log(Power), data = trn_data,
     col = "dodgerblue",
     main = "log(Price) and log(Power) relation")
plot(Price ~ Mileage, data = trn_data, col = "dodgerblue",
     main = "Price and Mileage relation")
plot(log(Price) ~ log(Mileage), data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Mileage) relation")
plot(Price ~ Engine, data = trn_data, col = "dodgerblue",
     main = "Price and Engine relation")
plot(log(Price) ~ log(Engine), data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Engine) relation")
plot(Price ~ Kilometers_Driven, data = trn_data, col = "dodgerblue",
     main = "Price and Kilometers_Driven relation")
plot(log(Price) ~ log(Kilometers_Driven), data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Kilometers_Driven) relation")
From these plots we inferred that the logs of the above parameters are better suited for regression.
# Scatter residuals against fitted values with a zero reference line,
# to visually check the constant-variance assumption.
plot_fit_res = function(model, title = "") {
  fit_vals = fitted(model)
  res_vals = resid(model)
  plot(fit_vals, res_vals, col = "grey", pch = 20,
       xlab = "Fitted", ylab = "Residuals", main = title)
  abline(h = 0, col = "darkorange", lwd = 2)
}
# Normal Q-Q plot of the model residuals, to visually check normality.
plot_qq = function(model, title = "") {
  model_res = resid(model)
  qqnorm(model_res, main = title, col = "darkgrey")
  qqline(model_res, col = "dodgerblue", lwd = 2)
}
# Predicted vs actual on the log scale with the y = x reference line.
# NOTE(review): the actuals are log(data$Price), so this assumes `model`
# has log(Price) as its response — confirm before using with a raw-Price model.
plot_fit_pred = function(model, data, title = "") {
  y_hat = predict(model, newdata = data)
  y_log = log(data$Price)
  plot(y_hat, y_log, col = "grey", pch = 20,
       xlab = "Predicted", ylab = "Actual", main = title)
  abline(a = 0, b = 1, col = "orange", lwd = 3)
}
# AIC of an lm fit up to an additive constant: n * log(RSS / n) + 2p,
# the same quantity extractAIC() reports for linear models.
calc_aic = function(model) {
  res = resid(model)
  n = length(res)
  n * log(sum(res^2) / n) + 2 * length(coef(model))
}
# BIC analogue of calc_aic: log(n) penalty per parameter instead of 2.
calc_bic = function(model) {
  res = resid(model)
  n = length(res)
  n * log(sum(res^2) / n) + log(n) * length(coef(model))
}
# Leave-one-out cross-validation RMSE via the PRESS shortcut for linear models:
# deleted residual = resid / (1 - leverage).
calc_loocv_rmse = function(model) {
  press_resid = resid(model) / (1 - hatvalues(model))
  sqrt(mean(press_resid^2))
}
# Adjusted R-squared pulled from the model summary.
calc_adj_r2 = function(model) {
  summary(model)$adj.r.squared
}
# RMSE on the original price scale.
# NOTE(review): exponentiates the predictions, so this assumes `model` has
# log(Price) as its response — confirm before using with a raw-Price model.
calc_rmse = function(model, data) {
  predicted_price = exp(predict(model, newdata = data))
  sqrt(mean((predicted_price - data$Price)^2))
}
# Collect all four model-selection criteria in a one-row data frame
# for side-by-side comparison across candidate models.
calc_quality_criterions = function(model) {
  data.frame(
    "AIC" = calc_aic(model),
    "BIC" = calc_bic(model),
    "LOOCV_RMSE" = calc_loocv_rmse(model),
    "ADJ_R_SQ" = calc_adj_r2(model)
  )
}
The first model we created was an additive model using all the parameters.
# baseline: additive model with every predictor and an untransformed response
model1 = lm(Price ~ ., data = trn_data)
calc_quality_criterions(model1)
## AIC BIC LOOCV_RMSE ADJ_R_SQ
## 1 15951 16104.25 7.088395 0.7023269
plot_fit_res(model1, title = "Residuals vs fitted plot for Model1")
plot_qq(model1, title = "QQ plot for Model1")
As the plots showed that it is better to use the log of the numeric parameters, we also created a model using the logs of Power, Mileage, Engine and Kilometers_Driven.
# log-transform the response and the skewed numeric predictors
model2 = lm(log(Price) ~ Location+Year+log(Kilometers_Driven)+Fuel_Type+Transmission+Owner_Type+log(Mileage)+log(Engine)+log(Power)+Seats, data = trn_data)
calc_quality_criterions(model2)
## AIC BIC LOOCV_RMSE ADJ_R_SQ
## 1 -10990.32 -10837.07 0.2858198 0.8895492
plot_fit_res(model2, title = "Residuals vs fitted plot for Model2")
plot_qq(model2, title = "QQ plot for Model2")
We were able to improve all the criteria just by using the log transform: AIC, BIC and LOOCV_RMSE decreased and ADJ_R_SQ increased.
We tried to further decrease the AIC and BIC by running a backward search on model2.
model3 = step(model2, direction = "backward")
## Start: AIC=-10990.32
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type +
## Transmission + Owner_Type + log(Mileage) + log(Engine) +
## log(Power) + Seats
##
## Df Sum of Sq RSS AIC
## - Seats 1 0.066 353.27 -10991.5
## <none> 353.20 -10990.3
## - log(Engine) 1 1.331 354.54 -10975.8
## - log(Mileage) 1 2.673 355.88 -10959.3
## - Owner_Type 3 4.607 357.81 -10939.5
## - log(Kilometers_Driven) 1 12.203 365.41 -10843.5
## - Fuel_Type 3 32.572 385.78 -10609.7
## - Transmission 1 39.314 392.52 -10529.8
## - Location 10 52.305 405.51 -10405.0
## - log(Power) 1 113.459 466.66 -9771.4
## - Year 1 237.892 591.10 -8735.4
##
## Step: AIC=-10991.5
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type +
## Transmission + Owner_Type + log(Mileage) + log(Engine) +
## log(Power)
##
## Df Sum of Sq RSS AIC
## <none> 353.27 -10991.5
## - log(Engine) 1 1.268 354.54 -10977.8
## - log(Mileage) 1 2.680 355.95 -10960.4
## - Owner_Type 3 4.607 357.88 -10940.7
## - log(Kilometers_Driven) 1 12.537 365.81 -10840.7
## - Fuel_Type 3 33.162 386.43 -10604.3
## - Transmission 1 41.473 394.74 -10507.0
## - Location 10 52.407 405.68 -10405.2
## - log(Power) 1 129.739 483.01 -9622.5
## - Year 1 251.587 604.86 -8636.5
# backward elimination with k = log(n), i.e. BIC instead of AIC
n = length(resid(model2))
model4 = step(model2, direction = "backward", k = log(n))
## Start: AIC=-10837.07
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type +
## Transmission + Owner_Type + log(Mileage) + log(Engine) +
## log(Power) + Seats
##
## Df Sum of Sq RSS AIC
## - Seats 1 0.066 353.27 -10844.6
## <none> 353.20 -10837.1
## - log(Engine) 1 1.331 354.54 -10829.0
## - log(Mileage) 1 2.673 355.88 -10812.4
## - Owner_Type 3 4.607 357.81 -10805.4
## - log(Kilometers_Driven) 1 12.203 365.41 -10696.6
## - Fuel_Type 3 32.572 385.78 -10475.6
## - Transmission 1 39.314 392.52 -10382.9
## - Location 10 52.305 405.51 -10315.6
## - log(Power) 1 113.459 466.66 -9624.5
## - Year 1 237.892 591.10 -8588.5
##
## Step: AIC=-10844.64
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type +
## Transmission + Owner_Type + log(Mileage) + log(Engine) +
## log(Power)
##
## Df Sum of Sq RSS AIC
## <none> 353.27 -10845
## - log(Engine) 1 1.268 354.54 -10837
## - log(Mileage) 1 2.680 355.95 -10820
## - Owner_Type 3 4.607 357.88 -10813
## - log(Kilometers_Driven) 1 12.537 365.81 -10700
## - Fuel_Type 3 33.162 386.43 -10476
## - Transmission 1 41.473 394.74 -10366
## - Location 10 52.407 405.68 -10322
## - log(Power) 1 129.739 483.01 -9482
## - Year 1 251.587 604.86 -8496
calc_quality_criterions(model4)
## AIC BIC LOOCV_RMSE ADJ_R_SQ
## 1 -10991.5 -10844.64 0.285739 0.8895538
Using model4 we were able to further reduce the BIC and LOOCV_RMSE and slightly increase the ADJ_R_SQ, with minimal effect on AIC.
From the correlation testing earlier we found that Engine and Power are highly correlated. This means we should also try removing the Engine parameter and see if it improves anything.
# drop Engine (highly correlated with Power, r ~ 0.88), then BIC backward search
model5 = lm(log(Price) ~ Location+Year+log(Kilometers_Driven)+Fuel_Type+Transmission+Owner_Type+log(Mileage)+log(Power)+Seats, data = trn_data)
model5 = step(model5, direction = "backward", k = log(n))
## Start: AIC=-10828.98
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type +
## Transmission + Owner_Type + log(Mileage) + log(Power) + Seats
##
## Df Sum of Sq RSS AIC
## - Seats 1 0.00 354.54 -10837.3
## <none> 354.54 -10829.0
## - Owner_Type 3 4.56 359.10 -10798.1
## - log(Mileage) 1 6.34 360.88 -10759.7
## - log(Kilometers_Driven) 1 12.09 366.63 -10690.3
## - Transmission 1 38.38 392.91 -10386.9
## - Location 10 52.14 406.67 -10311.5
## - Fuel_Type 3 56.52 411.06 -10205.8
## - Year 1 236.60 591.14 -8596.6
## - log(Power) 1 343.14 697.68 -7870.3
##
## Step: AIC=-10837.32
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type +
## Transmission + Owner_Type + log(Mileage) + log(Power)
##
## Df Sum of Sq RSS AIC
## <none> 354.54 -10837.3
## - Owner_Type 3 4.56 359.10 -10806.5
## - log(Mileage) 1 9.04 363.58 -10735.3
## - log(Kilometers_Driven) 1 12.21 366.75 -10697.3
## - Transmission 1 40.25 394.79 -10374.4
## - Location 10 52.15 406.69 -10319.7
## - Fuel_Type 3 68.24 422.77 -10091.0
## - Year 1 251.81 606.35 -8493.6
## - log(Power) 1 357.65 712.19 -7788.5
# model5 diagnostics: criteria table plus the three standard plots
calc_quality_criterions(model5)
## AIC BIC LOOCV_RMSE ADJ_R_SQ
## 1 -10977.8 -10837.32 0.286194 0.8891829
plot_fit_res(model5, title = "Residuals vs fitted plot for Model5")
plot_qq(model5, title = "QQ plot for Model5")
plot_fit_pred(model5, trn_data, title = "Predicted vs Actual for Model5")
We then also tried some interaction models
# all two-way interactions of the log-model predictors, pruned by backward AIC.
# LOOCV_RMSE comes out Inf — presumably the rank-deficient fit (see the
# predict() warning further below) leaves some leverages at 1; TODO confirm.
model6 = lm(log(Price) ~ (Location+Year+log(Kilometers_Driven)+Fuel_Type+Transmission+Owner_Type+log(Mileage)+log(Engine)+log(Power)+Seats)^2, data = trn_data)
model7 = step(model6, direction = "backward", trace = FALSE)
calc_quality_criterions(model7)
## AIC BIC LOOCV_RMSE ADJ_R_SQ
## 1 -11908.8 -11346.88 Inf 0.9117736
plot_fit_res(model7, title = "Residuals vs fitted plot for Model7")
plot_qq(model7, title = "QQ plot for Model7")
# predict() warns here because the interaction fit is rank deficient
plot_fit_pred(model7, trn_data, title = "Predicted vs Actual for Model7")
## Warning in predict.lm(model, newdata = data): prediction from a rank-
## deficient fit may be misleading
Using model7 we were further able to decrease the AIC and BIC from the additive models.
We were able to find 2 models that were giving us very good results. One is additive and another is an interaction model.
# final additive candidate: model5 (log response, Engine dropped)
model_add = model5
calc_quality_criterions(model_add)
## AIC BIC LOOCV_RMSE ADJ_R_SQ
## 1 -10977.8 -10837.32 0.286194 0.8891829
plot_fit_res(model_add, title = "Residuals vs fitted plot for Additive model")
plot_qq(model_add, title = "QQ plot for Additive model")
plot_fit_pred(model_add, trn_data, title = "Predicted vs Actual for Additive Model For train data")
plot_fit_pred(model_add, tst_data, title = "Predicted vs Actual for Additive Model For test data")
# held-out RMSE on the original price scale (INR Lakhs)
calc_rmse(model_add, tst_data)
## [1] 4.387601
# final interaction candidate: model7 (backward-AIC-pruned two-way interactions)
model_int = model7
calc_quality_criterions(model_int)
## AIC BIC LOOCV_RMSE ADJ_R_SQ
## 1 -11908.8 -11346.88 Inf 0.9117736
plot_fit_res(model_int, title = "Residuals vs fitted plot for Interaction model")
plot_qq(model_int, title = "QQ plot for Interaction model")
plot_fit_pred(model_int, trn_data, title = "Predicted vs Actual for Interaction model for train data")
# BUG FIX: this plot previously reused model_add while its title claimed the
# Interaction model; it must plot model_int on the held-out data.
plot_fit_pred(model_int, tst_data, title = "Predicted vs Actual for Interaction Model For test data")
# held-out RMSE on the original price scale (INR Lakhs)
calc_rmse(model_int, tst_data)
## [1] 3.627951
In the context of the data, both models use log(Price) as the response, which as shown earlier gives better results. Parameters like Name and ID were removed from both models. The numerical variables Kilometers_Driven, Engine, Power and Mileage were shown to work better when used with log. Several dummy variables were introduced to accommodate the factor variables such as Owner_Type, Transmission and Location.
The AIC and BIC values for both models are low, which is good. The LOOCV_RMSE values are also low, and the adjusted R-squared values are close to 1. The residuals vs fitted plots show that both models follow the equal-variance assumption to some extent, and the QQ plots show that the models also roughly follow the normality assumption.
The interaction model has a lower RMSE than the additive model, which means that the interaction model gives better results.
# --- Appendix: full code listing (duplicates the chunks shown in the report body) ---
library(readr)
#reading data from the csv
train_data = read_csv("./train-data.csv")
test_data = read_csv("./test-data.csv")
#clean data function
# Appendix copy of the cleaning routine: drops New_Price, strips unit suffixes,
# converts categoricals to factors, and removes unusable rows/columns.
clean_data = function(data) {
  #removing New Price column (column 13; mostly missing)
  data = data[, -13]
  # strip a trailing unit word (" kmpl", " CC", " bhp"); sub() is vectorized,
  # so the lapply/unlist wrapper is unnecessary. Unparseable values become NA.
  remove_units = function(values) {
    as.numeric(sub("\\s+\\D+$", "", values))
  }
  data$Location = factor(data$Location)
  data$Transmission = factor(data$Transmission)
  data$Fuel_Type = factor(data$Fuel_Type)
  data$Owner_Type = factor(data$Owner_Type)
  data$Power = remove_units(data$Power)
  data$Mileage = remove_units(data$Mileage)
  data$Engine = remove_units(data$Engine)
  data = na.omit(data)
  # BUG FIX: `data[-which(data$Mileage == 0), ]` deletes every row when no
  # mileage is zero (negative indexing with integer(0)); use a logical mask.
  data = data[data$Mileage != 0, ]
  # drop X1 and Name
  data = data[, -c(1, 2)]
  return(data)
}
# LOOCV RMSE via the PRESS shortcut: deleted residual = resid / (1 - leverage).
loocv_rmse = function(model) {
  press = resid(model) / (1 - hatvalues(model))
  sqrt(mean(press^2))
}
train_data = clean_data(train_data)
test_data = clean_data(test_data)
# exploratory pairs plot and per-predictor transformation plots
pairs(train_data[c("Power","Mileage","Engine","Price")])
plot(log(Price) ~ log(Power), data = train_data)
train_data_nm = train_data
# sign flip used to eyeball the (negative) Mileage relationship
mileage_transform = function(x) {
return(-x)
}
plot(log(Price) ~ mileage_transform(Mileage), data = train_data_nm)
plot(log(Price) ~ log(Engine), data = train_data)
train_data_nm = train_data
# drop the single largest-odometer outlier before plotting
train_data_nm = train_data[-which.max(train_data$Kilometers_Driven),]
plot(log(Price) ~ log(Kilometers_Driven), data = train_data_nm)
# NOTE(review): clean_data() already removed columns 1-2 (X1, Name), so this
# drops the first two REMAINING columns (Location, Year) — confirm intended.
train_data = train_data[,-c(1,2)]
model = lm(Price ~ ., data = train_data)
model_aic = step(model, direction = "backward")
anova(model_aic, model)
log_model = lm(log(Price) ~ .,data = train_data)
log_model_aic = step(log_model, direction = "backward")
loocv_rmse(log_model)
loocv_rmse(log_model_aic)
sqrt(mean(resid(log_model_aic)^2))
sqrt(mean(resid(model_aic)^2))
loocv_rmse(model_aic)
log_model_int = lm(log(Price) ~ (.)^2, data = train_data)
log_model_int_aic = step(log_model_int, direction = "backward")
loocv_rmse(log_model_int_aic)
anova(log_model_aic, log_model_int_aic)
sqrt(mean(resid(log_model_int_aic)^2))
price_test_hat = exp(predict(log_model_int_aic, newdata = test_data))
# NOTE(review): train_data/test_data were already cleaned above; re-running
# clean_data() on them would drop the wrong columns. These lines look like a
# fresh-session chunk pasted from the report body — verify against the Rmd.
train_data = clean_data(train_data)
test_data = clean_data(test_data)
smp_size = floor(0.75 * nrow(train_data))
## set the seed to make your partition reproducible
set.seed(42)
train_ind = sample(seq_len(nrow(train_data)), size = smp_size)
trn_data = train_data[train_ind, ]
tst_data = train_data[-train_ind, ]
# Appendix copies of the diagnostic helpers used in the report body.

# Residuals-vs-fitted scatter with a zero line (equal-variance check).
plot_fit_res = function(model, title = "") {
  f = fitted(model)
  r = resid(model)
  plot(f, r, col = "grey", pch = 20,
       xlab = "Fitted", ylab = "Residuals", main = title)
  abline(h = 0, col = "darkorange", lwd = 2)
}
# Normal Q-Q plot of the residuals (normality check).
plot_qq = function(model, title = "") {
  r = resid(model)
  qqnorm(r, main = title, col = "darkgrey")
  qqline(r, col = "dodgerblue", lwd = 2)
}
# Predicted vs actual on the log scale; assumes `model` has response log(Price).
plot_fit_pred = function(model, data, title = "") {
  y_hat = predict(model, newdata = data)
  plot(y_hat, log(data$Price), col = "grey", pch = 20,
       xlab = "Predicted", ylab = "Actual", main = title)
  abline(a = 0, b = 1, col = "orange", lwd = 3)
}
# AIC up to an additive constant: n*log(RSS/n) + 2p (matches extractAIC()).
calc_aic = function(model) {
  r = resid(model)
  n = length(r)
  n * log(sum(r^2) / n) + 2 * length(coef(model))
}
# BIC analogue: log(n) penalty per parameter instead of 2.
calc_bic = function(model) {
  r = resid(model)
  n = length(r)
  n * log(sum(r^2) / n) + log(n) * length(coef(model))
}
# Leave-one-out CV RMSE via the PRESS shortcut for linear models.
calc_loocv_rmse = function(model) {
  press = resid(model) / (1 - hatvalues(model))
  sqrt(mean(press^2))
}
# Adjusted R-squared from the model summary.
calc_adj_r2 = function(model) {
  summary(model)$adj.r.squared
}
# Test RMSE on the price scale; assumes the model predicts log(Price).
calc_rmse = function(model, data) {
  y_hat = exp(predict(model, newdata = data))
  sqrt(mean((y_hat - data$Price)^2))
}
# Collect all criteria in a one-row data frame for side-by-side comparison.
calc_quality_criterions = function(model) {
  data.frame(
    "AIC" = calc_aic(model),
    "BIC" = calc_bic(model),
    "LOOCV_RMSE" = calc_loocv_rmse(model),
    "ADJ_R_SQ" = calc_adj_r2(model)
  )
}